In the last post, I acquired the data from Amazon, pre-processed it, converted it into a corpus, and built a word cloud. In this post I plan to tidy the data further and analyze it using visualizations.
Loading the libraries
# Echo the source of every chunk in the rendered document.
knitr::opts_chunk$set(echo = TRUE)
Reading the data
# Load the scraped Amazon reviews from the CSV in the working directory.
reviews <- read_csv(file = "amazonreview.csv")
Summary of the data
#' Clean raw review text before tokenisation
#'
#' Removes URLs, @-mentions, punctuation, digits and "<...>" sequences,
#' spells out ampersands, lower-cases the text and collapses whitespace.
#'
#' @param text A character vector of raw review text.
#' @return A character vector of the same length as `text`.
clean_text <- function(text) {
  text %>%
    # Remove URLs. Match only up to the next whitespace: the previous
    # pattern ended in greedy "(.*)" groups, so it also deleted all of the
    # text that followed a URL on the same line.
    str_remove_all(" ?(f|ht)tps?://\\S+") %>%
    # Remove mentions
    str_remove_all("@[[:alnum:]_]*") %>%
    # Replace "&" (and the HTML character reference "&amp;") with "and".
    # Padded with spaces so "A&B" does not fuse into "AandB"; the extra
    # whitespace is collapsed by str_squish() below.
    str_replace_all("&amp;|&", " and ") %>%
    # Remove punctuation
    str_remove_all("[[:punct:]]") %>%
    # Remove digits
    str_remove_all("[[:digit:]]") %>%
    # Replace any newline characters with a space
    str_replace_all("[\\r\\n]", " ") %>%
    # Remove strings like "<U+0001F9F5>"
    str_remove_all("<.*?>") %>%
    # Make everything lowercase
    str_to_lower() %>%
    # Trim leading/trailing whitespace and collapse internal runs to one space
    str_squish()
}
# Bar chart of how many reviews were given each star rating,
# rendered as an interactive plotly widget.
p <- reviews %>%
  group_by(review_star) %>%
  ggplot(mapping = aes(x = review_star)) +
  geom_bar() +
  labs(title = "Frequency per star")
ggplotly(p)
Unique set of ASIN numbers
# List each distinct ASIN present in the reviews.
reviews %>%
  distinct(ASIN)
Adding new variable book title to the reviews
# Map each ASIN to its book title. ASINs not listed here get NA.
# The "Breaking Dawn" label previously ended in a stray space, which leaked
# into legend and facet labels; it has been removed.
reviews <- reviews %>%
  mutate(
    book_title = case_when(
      ASIN == "B0001DBI1Q" ~ "A Game of Thrones: A Song of Ice and Fire, Book 1",
      ASIN == "B0001MC01Y" ~ "A Clash of Kings: A Song of Ice and Fire, Book 2",
      ASIN == "B00026WUZU" ~ "A Storm of Swords: A Song of Ice and Fire, Book 3",
      ASIN == "B07ZN4WM13" ~ "A Feast for Crows: A Song of Ice and Fire, Book 4",
      ASIN == "B005C7QVUE" ~ "A Dance with Dragons: A Song of Ice and Fire, Book 5",
      ASIN == "B000BO2D64" ~ "Twilight: The Twilight Saga, Book 1",
      ASIN == "B000I2JFQU" ~ "New Moon: The Twilight Saga, Book 2",
      ASIN == "B000UW50LW" ~ "Eclipse: The Twilight Saga, Book 3",
      ASIN == "B001FD6RLM" ~ "Breaking Dawn: The Twilight Saga, Book 4",
      ASIN == "B07HHJ7669" ~ "The Hunger Games",
      ASIN == "B07T6BQV2L" ~ "Catching Fire: The Hunger Games",
      ASIN == "B07T43YYRY" ~ "Mockingjay: The Hunger Games, Book 3"
    )
  )
reviews
# Count reviews per (book, star rating) and draw one coloured line per book.
p <- reviews %>%
  group_by(book_title, review_star) %>%
  count() %>%
  ggplot(mapping = aes(x = review_star, y = n, colour = book_title)) +
  geom_line() +
  labs(
    title = "Book vs Freq of each star",
    x = "Type of star",
    y = "Frequency"
  )
ggplotly(p)
The combined plot is not very clear, so let’s plot each book individually to make it easier to read.
# Same per-book star counts as a line chart, but with one panel per book
# (two panels per row) instead of overlaid coloured lines.
p <- reviews %>%
  group_by(book_title, review_star) %>%
  count() %>%
  ggplot(mapping = aes(x = review_star, y = n)) +
  geom_line() +
  facet_wrap(~book_title, ncol = 2) +
  labs(
    title = "Book vs Freq of each star",
    x = "Type of star",
    y = "Frequency"
  )
ggplotly(p)
Adding new variable series title to the reviews
# Map each ASIN to the series it belongs to. ASINs not listed here get NA.
reviews <- reviews %>%
  mutate(
    series_title = case_when(
      ASIN %in% c(
        "B0001DBI1Q", "B0001MC01Y", "B00026WUZU", "B07ZN4WM13", "B005C7QVUE"
      ) ~ "A Song of Ice and Fire",
      ASIN %in% c(
        "B000BO2D64", "B000I2JFQU", "B000UW50LW", "B001FD6RLM"
      ) ~ "The Twilight Saga",
      ASIN %in% c(
        "B07HHJ7669", "B07T6BQV2L", "B07T43YYRY"
      ) ~ "The Hunger Games"
    )
  )
reviews
# Count reviews per (series, star rating) and draw one coloured line per series.
p <- reviews %>%
  group_by(series_title, review_star) %>%
  count() %>%
  ggplot(mapping = aes(x = review_star, y = n, colour = series_title)) +
  geom_line() +
  labs(
    title = "Series vs Freq of each star",
    x = "Type of star",
    y = "Frequency"
  )
ggplotly(p)
Tokenization of data
# Converting the text into a corpus
# NOTE(review): reads the `clean_text` column of `reviews`; the step that
# creates that column is not visible in this script — confirm it exists.
text_corpus <- corpus(reviews$clean_text)

# Converting the corpus into tokens: drop punctuation and numbers while
# tokenising, then remove English stopwords.
text_token <- tokens_select(
  tokens(text_corpus, remove_punct = TRUE, remove_numbers = TRUE),
  pattern = stopwords("en"),
  selection = "remove"
)
text_token
# Finding the frequency of each word
# NOTE(review): `text_dfm` is not defined in the visible script; presumably
# it is the document-feature matrix built from `text_token` — confirm.
# The assignment below was truncated in the source ("word_counts <,dec=T))")
# and has been reconstructed from its surviving tail and the lines after it.
word_counts <- as.data.frame(sort(colSums(text_dfm), decreasing = TRUE))
colnames(word_counts) <- "Frequency"
# Row names carry the feature (word) labels from colSums(); copy them into a column.
word_counts$word <- row.names(word_counts)
# Rows are already sorted by descending frequency, so rank is the row index.
word_counts$Rank <- seq_len(nrow(word_counts))
word_counts
# Trim the dfm with a minimum term-frequency threshold of 50.
# NOTE(review): `docfreq_type = "prop"` is passed without any
# min_docfreq/max_docfreq threshold, so it looks like it has no effect — confirm.
text_df <- text_dfm %>%
  dfm_trim(min_termfreq = 50, docfreq_type = "prop")

# Build the feature co-occurrence matrix from the trimmed dfm
text_fcm <- text_df %>%
  fcm()
text_fcm
Network plot
# Names of the 50 top features in the co-occurrence matrix
top_features <- text_fcm %>%
  topfeatures(n = 50) %>%
  names()

# Keep only those top features in the matrix
even_text_fcm <- text_fcm %>%
  fcm_select(pattern = top_features, selection = "keep")

# Vertex size weight: log of each feature's column total
size <- even_text_fcm %>%
  colSums() %>%
  log()

# Draw the network, scaling vertex sizes so the largest is 2
textplot_network(even_text_fcm, vertex_size = size / max(size) * 2)
Next, I will try sentiment analysis using multiple lexicons and compare which one is most suitable.
Source Code
